#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.preprocessing import StandardScaler  # 导入数据标准化包
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error  # 计算均方根误差

# Default location of the listings CSV. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# (which emit a SyntaxWarning on recent Python versions).
_DEFAULT_LISTINGS_CSV = r'E:\培训教程\python\唐宇迪-机器学习课程\机器学习算法配套案例实战\K近邻\listings.csv'


def sinplot(csv_path=_DEFAULT_LISTINGS_CSV):
    """Train and evaluate a two-feature KNN price regressor on a listings CSV.

    Steps performed:

    1. Load the CSV, keep eight numeric feature columns and the first 200 rows.
    2. Strip currency formatting from ``price`` and convert it to ``float``.
    3. Drop rows with missing values and standardise every kept column.
    4. Use the first 150 remaining rows for training and the rest for testing.
    5. Print the Euclidean distance between two sample rows.
    6. Fit ``KNeighborsRegressor`` on ``accommodates``/``bedrooms`` and print
       the test RMSE followed by the test R^2 score.

    Args:
        csv_path: Path of the listings CSV to read. Defaults to the location
            that was previously hard-coded.

    Returns:
        None. All results are printed to stdout.

    Side effects:
        Sets ``pd.options.display.max_columns`` to 50 for the whole process.
    """
    pd.options.display.max_columns = 50
    features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price',
                'minimum_nights', 'maximum_nights', 'number_of_reviews']

    # Copy the slice so the column assignments below write to an independent
    # frame (avoids pandas' SettingWithCopy ambiguity).
    dc_listings = pd.read_csv(csv_path)[features].head(200).copy()

    # Remove currency symbols, thousands separators and whitespace from the
    # price text before converting to float. ``regex=True`` is spelled out
    # because the default of ``str.replace`` differs between pandas versions,
    # and a bare "$" pattern would match end-of-string rather than the symbol.
    dc_listings['price'] = (
        dc_listings['price']
        .str.replace(r'[$,\s]', '', regex=True)
        .astype(float)
    )
    # Drop every row that contains at least one missing value.
    dc_listings = dc_listings.dropna()

    # Standardise each column to zero mean and unit standard deviation.
    # NOTE(review): the scaler is fitted on all rows (train and test) and on
    # the target column too, so the metrics below are in standardised units.
    dc_listings[features] = StandardScaler().fit_transform(dc_listings[features])
    print(dc_listings.mean())

    norm_train_df = dc_listings.iloc[:150].copy()
    norm_test_df = dc_listings.iloc[150:].copy()

    # Euclidean distance between two sample rows using scipy's helper.
    # NOTE(review): the second row is at position 20 although the printed
    # label says "fifth"; the label is kept unchanged for output stability.
    distance_cols = ['accommodates', 'bathrooms']
    first_listing = dc_listings.iloc[0][distance_cols]
    other_listing = dc_listings.iloc[20][distance_cols]
    first_fifth_distance = distance.euclidean(first_listing, other_listing)
    print('first_fifth_distance : ', first_fifth_distance)

    # KNN regression with scikit-learn: fit(X, y) learns from the training
    # rows, predict(X) returns one prediction per test row as a numpy array.
    cols = ['accommodates', 'bedrooms']
    knn = KNeighborsRegressor()
    knn.fit(norm_train_df[cols], norm_train_df['price'])
    two_features_predictions = knn.predict(norm_test_df[cols])

    # Root mean squared error: square root of the mean squared difference
    # between true and predicted prices; lower is better. ``0.5`` is used
    # rather than ``1 / 2`` so the exponent never collapses to 0 under
    # integer division.
    two_features_mse = mean_squared_error(norm_test_df['price'],
                                          two_features_predictions)
    two_features_rmse = two_features_mse ** 0.5
    print(two_features_rmse)

    # R^2 on the test set. score() needs the same feature columns the model
    # was fitted on and the TRUE targets, not the model's own predictions.
    print(knn.score(norm_test_df[cols], norm_test_df['price']))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    sinplot()